In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import hvplot.pandas
from scipy import stats
from plotnine import *
from plotnine.data import mtcars

import warnings
warnings.filterwarnings('ignore')
In [2]:
#importing and reading the dataset 

mv_ds=pd.read_csv("/Users/Divya Dubey/Downloads/movies.csv")
In [3]:
mv_ds.head(5)
Out[3]:
id title genres original_language overview popularity production_companies release_date budget revenue runtime status tagline vote_average vote_count credits keywords poster_path backdrop_path recommendations
0 436270 Black Adam Action-Fantasy-Science Fiction en Nearly 5000 years after he was bestowed with t... 11752.795 New Line Cinema-Flynn Picture Company-Seven Bu... 2022-10-19 200000000.0 368000000.0 125.0 Released The world needed a hero. It got Black Adam. 7.292 2420.0 Dwayne Johnson-Aldis Hodge-Noah Centineo-Sarah... lightning-anti hero-superhero-based on comic-d... /pFlaoHTZeyNkG83vxsAJiGzfSsa.jpg /bQXAqRx2Fgc46uCVWgoPz5L5Dtr.jpg 663712-49046-642721-963954-365297-887731-63113...
1 724495 The Woman King Action-Drama-History en The story of the Agojie the all-female unit of... 4957.725 TriStar Pictures-Entertainment One-JuVee Produ... 2022-09-15 50000000.0 91000000.0 135.0 Released Her reign begins. 7.906 586.0 Viola Davis-Thuso Mbedu-Lashana Lynch-John Boy... africa-arranged marriage-warrior woman-based o... /438QXt1E3WJWb3PqNniK0tAE5c1.jpg /7zQJYV02yehWrQN6NjKsBorqUUS.jpg 49046-436270-913290-619730-882598-913816-80093...
2 829799 Paradise City Crime-Action-Thriller en Renegade bounty hunter Ryan Swan must carve hi... 3133.802 Arcana Studio-308 Enterprises-Yale Productions... 2022-11-11 20000000.0 0.0 93.0 Released NaN 6.153 36.0 John Travolta-Bruce Willis-Blake Jenner-Praya ... NaN /uGuHHS9SWv7MrFhCH6zoGGd7DA8.jpg /au4HUSWDRadIcl9CqySlw1kJMfo.jpg 879444-1018494-1015724-1026706-1032427-945444-...
3 792775 Cop Secret Comedy-Action-Thriller is When Bússi Iceland's toughest cop is forced to... 2447.908 Pegasus Pictures-Stöð 2-SamFilm 2022-05-23 0.0 0.0 100.0 Released To solve this crime, they'll need to break all... 6.375 28.0 Auðunn Blöndal-Egill Einarsson-Steinunn Ólína ... co-workers relationship-bank robbery-lgbt-crim... /jnWyZsaCl3Ke6u6ReSmBRO8S1rX.jpg /sUuzl04qNIYsnwCLQpZ2RSvXA1V.jpg 639933-852448-823625
4 956101 The Eighth Clause Thriller la Kat and Borja appear to be a perfect couple bu... 2259.303 SDB Films-El Hombre Orquesta 2022-04-29 0.0 0.0 0.0 Released NaN 4.600 10.0 Maite Perroni-Manuel Vega-Óscar Jaenada-Jessic... NaN /8tc8eMFAX2SDC1TRu987qFQy8Cl.jpg /kLnqNE9Af5QHyvUxw8cDGhF1ilv.jpg NaN
In [4]:
#Shape of the dataset (no of rows and no of columns)

mv_ds.shape
Out[4]:
(735388, 20)

Data Cleaning¶

In [5]:
#renaming vote_average to imdb_ratings

mv_ds.rename(columns = {'vote_average':'imdb_ratings'}, inplace = True)
mv_ds.head(10)
Out[5]:
id title genres original_language overview popularity production_companies release_date budget revenue runtime status tagline imdb_ratings vote_count credits keywords poster_path backdrop_path recommendations
0 436270 Black Adam Action-Fantasy-Science Fiction en Nearly 5000 years after he was bestowed with t... 11752.795 New Line Cinema-Flynn Picture Company-Seven Bu... 2022-10-19 200000000.0 368000000.0 125.0 Released The world needed a hero. It got Black Adam. 7.292 2420.0 Dwayne Johnson-Aldis Hodge-Noah Centineo-Sarah... lightning-anti hero-superhero-based on comic-d... /pFlaoHTZeyNkG83vxsAJiGzfSsa.jpg /bQXAqRx2Fgc46uCVWgoPz5L5Dtr.jpg 663712-49046-642721-963954-365297-887731-63113...
1 724495 The Woman King Action-Drama-History en The story of the Agojie the all-female unit of... 4957.725 TriStar Pictures-Entertainment One-JuVee Produ... 2022-09-15 50000000.0 91000000.0 135.0 Released Her reign begins. 7.906 586.0 Viola Davis-Thuso Mbedu-Lashana Lynch-John Boy... africa-arranged marriage-warrior woman-based o... /438QXt1E3WJWb3PqNniK0tAE5c1.jpg /7zQJYV02yehWrQN6NjKsBorqUUS.jpg 49046-436270-913290-619730-882598-913816-80093...
2 829799 Paradise City Crime-Action-Thriller en Renegade bounty hunter Ryan Swan must carve hi... 3133.802 Arcana Studio-308 Enterprises-Yale Productions... 2022-11-11 20000000.0 0.0 93.0 Released NaN 6.153 36.0 John Travolta-Bruce Willis-Blake Jenner-Praya ... NaN /uGuHHS9SWv7MrFhCH6zoGGd7DA8.jpg /au4HUSWDRadIcl9CqySlw1kJMfo.jpg 879444-1018494-1015724-1026706-1032427-945444-...
3 792775 Cop Secret Comedy-Action-Thriller is When Bússi Iceland's toughest cop is forced to... 2447.908 Pegasus Pictures-Stöð 2-SamFilm 2022-05-23 0.0 0.0 100.0 Released To solve this crime, they'll need to break all... 6.375 28.0 Auðunn Blöndal-Egill Einarsson-Steinunn Ólína ... co-workers relationship-bank robbery-lgbt-crim... /jnWyZsaCl3Ke6u6ReSmBRO8S1rX.jpg /sUuzl04qNIYsnwCLQpZ2RSvXA1V.jpg 639933-852448-823625
4 956101 The Eighth Clause Thriller la Kat and Borja appear to be a perfect couple bu... 2259.303 SDB Films-El Hombre Orquesta 2022-04-29 0.0 0.0 0.0 Released NaN 4.600 10.0 Maite Perroni-Manuel Vega-Óscar Jaenada-Jessic... NaN /8tc8eMFAX2SDC1TRu987qFQy8Cl.jpg /kLnqNE9Af5QHyvUxw8cDGhF1ilv.jpg NaN
5 505642 Black Panther: Wakanda Forever Action-Adventure-Science Fiction en Queen Ramonda Shuri M’Baku Okoye and the Dora ... 2248.449 Marvel Studios 2022-11-09 250000000.0 733000000.0 162.0 Released Forever. 7.547 1182.0 Letitia Wright-Lupita Nyong'o-Danai Gurira-Win... hero-sequel-superhero-based on comic-duringcre... /ps2oKfhY6DL3alynlSqY97gHSsg.jpg /xDMIl84Qo5Tsu62c9DGWhmPI67A.jpg 436270-785084-928123-663712-555604-615952-7441...
6 948276 Lost Bullet 2 Action-Drama-Thriller fr Having cleared his name genius mechanic Lino h... 2229.672 Versus Production-Nolita-Inoxy Films 2022-11-10 0.0 0.0 98.0 Released NaN 6.650 140.0 Alban Lenoir-Stéfi Celma-Pascale Arbillot-Séba... french film /uAeZI1JJbLPq7Bu5dziH7emHeu7.jpg /a64zCJnqOwHYdFHfdQFqQcxYSAz.jpg NaN
7 899294 Frank and Penelope Thriller-Horror-Crime en A tale of love and violence when a man on his ... 2128.548 NaN 2022-06-03 0.0 0.0 112.0 Released Prey for love. 7.500 37.0 Kevin Dillon-Sean Patrick Flanery-Johnathon Sc... NaN /5NpXoAi3nEQkEgLO09nmotPfyNa.jpg /eyiSLRh44SKKWIJ6bxWq8z1sscB.jpg NaN
8 872177 Corrective Measures Science Fiction-Action en Set in San Tiburon the world's most dangerous ... 1940.324 The Exchange-Tubi TV-Arcana Productions 2022-04-29 0.0 0.0 106.0 Released Anarchy in the world's most dangerous prison. 5.100 35.0 Bruce Willis-Hayley Sales-Michael Rooker-Kat R... based on comic-lockdown-prison measures /aHFq9NMhavOL0jtQvmHQ1c5e0ya.jpg /8Tr79lfoCkOYRg8SYwWit4OoQLi.jpg 1001717-639933-648579-755566-526896-725201
9 774752 The Guardians of the Galaxy Holiday Special Comedy-Science Fiction-Adventure en On a mission to make Christmas unforgettable f... 1916.450 Marvel Studios-Troll Court Entertainment-Kevin... 2022-11-25 0.0 0.0 45.0 Released The perfect present is a galaxy away. 7.465 564.0 Chris Pratt-Dave Bautista-Karen Gillan-Pom Kle... holiday-celebrity-superhero-talking dog-saving... /8dqXyslZ2hv49Oiob9UjlGSHSTR.jpg /rfnmMYuZ6EKOBvQLp2wqP21v7sI.jpg 1010705-35981-1015724-410113-15997-616820-3666...
In [6]:
#Displaying the data type of the dataset attributes 

mv_ds.dtypes
Out[6]:
id                        int64
title                    object
genres                   object
original_language        object
overview                 object
popularity              float64
production_companies     object
release_date             object
budget                  float64
revenue                 float64
runtime                 float64
status                   object
tagline                  object
imdb_ratings            float64
vote_count              float64
credits                  object
keywords                 object
poster_path              object
backdrop_path            object
recommendations          object
dtype: object
In [7]:
#Checking for the missing values in the dataset

mv_ds.isna().any()
Out[7]:
id                      False
title                    True
genres                   True
original_language       False
overview                 True
popularity              False
production_companies     True
release_date             True
budget                  False
revenue                 False
runtime                  True
status                  False
tagline                  True
imdb_ratings            False
vote_count              False
credits                  True
keywords                 True
poster_path              True
backdrop_path            True
recommendations          True
dtype: bool
In [8]:
#No of the missing values in the dataset

mv_ds.isna().sum()
Out[8]:
id                           0
title                        4
genres                  217209
original_language            0
overview                119309
popularity                   0
production_companies    393560
release_date             54900
budget                       0
revenue                      0
runtime                  37082
status                       0
tagline                 626228
imdb_ratings                 0
vote_count                   0
credits                 228937
keywords                523038
poster_path             193438
backdrop_path           511912
recommendations         696916
dtype: int64
In [9]:
#Let's check if there are any movies with same title
mv_ds['title'].duplicated().sum()
Out[9]:
159732
In [10]:
#Let's check if there are any movies with same title and same release date

mv_ds[['title','release_date']].duplicated().sum()
Out[10]:
75377
In [11]:
# lets get rid of the duplicate movies

mv_ds.drop_duplicates(subset=['title','release_date'], inplace=True)
mv_ds.shape
Out[11]:
(660011, 20)
In [12]:
# there are movies that have 0 vote count, so we will consider only those which have at least more than 20 vote counts.

# filtering the movies

mv_ds1 = mv_ds[mv_ds.vote_count >= 20].reset_index()

mv_ds1.isnull().sum()
Out[12]:
index                       0
id                          0
title                       0
genres                    178
original_language           0
overview                  467
popularity                  0
production_companies     3236
release_date                2
budget                      0
revenue                     0
runtime                    15
status                      0
tagline                 20581
imdb_ratings                0
vote_count                  0
credits                   643
keywords                 9664
poster_path               130
backdrop_path            2312
recommendations         15338
dtype: int64
In [13]:
mv_ds1.shape
Out[13]:
(43834, 21)
In [14]:
#Dropping the columns from the dataset that is not needed

mv_ds1.drop('poster_path', axis=1, inplace=True)
mv_ds1.drop('backdrop_path', axis=1, inplace=True)
mv_ds1.drop('recommendations', axis=1, inplace=True)
mv_ds1.drop('keywords', axis=1, inplace=True)
mv_ds1.drop('tagline', axis=1, inplace=True)
mv_ds1.drop('credits', axis=1, inplace=True)
mv_ds1.drop('overview', axis=1, inplace=True)

mv_ds1.shape
Out[14]:
(43834, 14)
In [15]:
mv_ds1.isnull().sum()
Out[15]:
index                      0
id                         0
title                      0
genres                   178
original_language          0
popularity                 0
production_companies    3236
release_date               2
budget                     0
revenue                    0
runtime                   15
status                     0
imdb_ratings               0
vote_count                 0
dtype: int64
In [16]:
# We can remove the null values from the dataset where the count is less . so that we don't loose much data
mv_ds1.dropna(axis=0,subset=['release_date','genres','production_companies'],inplace=True)
mv_ds1.shape
Out[16]:
(40554, 14)
In [17]:
#Replacing the runtime with the median of the value as the graph is right skewed 

mv_ds1["runtime"].fillna(mv_ds1["runtime"].median(),inplace=True)
mv_ds1.head(5)
Out[17]:
index id title genres original_language popularity production_companies release_date budget revenue runtime status imdb_ratings vote_count
0 0 436270 Black Adam Action-Fantasy-Science Fiction en 11752.795 New Line Cinema-Flynn Picture Company-Seven Bu... 2022-10-19 200000000.0 368000000.0 125.0 Released 7.292 2420.0
1 1 724495 The Woman King Action-Drama-History en 4957.725 TriStar Pictures-Entertainment One-JuVee Produ... 2022-09-15 50000000.0 91000000.0 135.0 Released 7.906 586.0
2 2 829799 Paradise City Crime-Action-Thriller en 3133.802 Arcana Studio-308 Enterprises-Yale Productions... 2022-11-11 20000000.0 0.0 93.0 Released 6.153 36.0
3 3 792775 Cop Secret Comedy-Action-Thriller is 2447.908 Pegasus Pictures-Stöð 2-SamFilm 2022-05-23 0.0 0.0 100.0 Released 6.375 28.0
4 5 505642 Black Panther: Wakanda Forever Action-Adventure-Science Fiction en 2248.449 Marvel Studios 2022-11-09 250000000.0 733000000.0 162.0 Released 7.547 1182.0
In [18]:
#No of the missing values in the dataset

mv_ds1.isna().sum()
Out[18]:
index                   0
id                      0
title                   0
genres                  0
original_language       0
popularity              0
production_companies    0
release_date            0
budget                  0
revenue                 0
runtime                 0
status                  0
imdb_ratings            0
vote_count              0
dtype: int64
In [19]:
#Removing the duplicate values in the datset

mv_ds1.drop_duplicates(inplace=True)
mv_ds1.shape
Out[19]:
(40554, 14)
In [20]:
# We need to replace the 0's value in runtime with the median of the value

mv_ds1['runtime'] = mv_ds1['runtime'].replace(0,mv_ds1['runtime'].median())
In [21]:
# We need to replace the 0's value in budget and revenue with the mean of the value 

mv_ds1['budget'] = mv_ds1['budget'].replace(0,mv_ds1['budget'].mean())
mv_ds1['revenue'] = mv_ds1['revenue'].replace(0,mv_ds1['revenue'].mean())
mv_ds1.head(10)
Out[21]:
index id title genres original_language popularity production_companies release_date budget revenue runtime status imdb_ratings vote_count
0 0 436270 Black Adam Action-Fantasy-Science Fiction en 11752.795 New Line Cinema-Flynn Picture Company-Seven Bu... 2022-10-19 2.000000e+08 3.680000e+08 125.0 Released 7.292 2420.0
1 1 724495 The Woman King Action-Drama-History en 4957.725 TriStar Pictures-Entertainment One-JuVee Produ... 2022-09-15 5.000000e+07 9.100000e+07 135.0 Released 7.906 586.0
2 2 829799 Paradise City Crime-Action-Thriller en 3133.802 Arcana Studio-308 Enterprises-Yale Productions... 2022-11-11 2.000000e+07 1.647994e+07 93.0 Released 6.153 36.0
3 3 792775 Cop Secret Comedy-Action-Thriller is 2447.908 Pegasus Pictures-Stöð 2-SamFilm 2022-05-23 6.066852e+06 1.647994e+07 100.0 Released 6.375 28.0
4 5 505642 Black Panther: Wakanda Forever Action-Adventure-Science Fiction en 2248.449 Marvel Studios 2022-11-09 2.500000e+08 7.330000e+08 162.0 Released 7.547 1182.0
5 6 948276 Lost Bullet 2 Action-Drama-Thriller fr 2229.672 Versus Production-Nolita-Inoxy Films 2022-11-10 6.066852e+06 1.647994e+07 98.0 Released 6.650 140.0
7 8 872177 Corrective Measures Science Fiction-Action en 1940.324 The Exchange-Tubi TV-Arcana Productions 2022-04-29 6.066852e+06 1.647994e+07 106.0 Released 5.100 35.0
8 9 774752 The Guardians of the Galaxy Holiday Special Comedy-Science Fiction-Adventure en 1916.450 Marvel Studios-Troll Court Entertainment-Kevin... 2022-11-25 6.066852e+06 1.647994e+07 45.0 Released 7.465 564.0
9 10 846778 Margaux Horror-Science Fiction en 1751.341 Motion Picture Corporation of America-Lighthou... 2022-09-09 6.066852e+06 1.647994e+07 105.0 Released 6.800 43.0
10 11 830784 Lyle, Lyle, Crocodile Comedy-Family-Music en 1710.176 Columbia Pictures-Eagle Pictures-TSG Entertain... 2022-10-07 5.000000e+07 7.976194e+07 106.0 Released 7.810 124.0
In [22]:
#Creating a new column to check the net profit made by the production_companies (revenue-budget) 

mv_ds1["Profit"]=mv_ds1['revenue'].sub(mv_ds1['budget'], axis = 0) 

mv_ds1
Out[22]:
index id title genres original_language popularity production_companies release_date budget revenue runtime status imdb_ratings vote_count Profit
0 0 436270 Black Adam Action-Fantasy-Science Fiction en 11752.795 New Line Cinema-Flynn Picture Company-Seven Bu... 2022-10-19 2.000000e+08 3.680000e+08 125.0 Released 7.292 2420.0 1.680000e+08
1 1 724495 The Woman King Action-Drama-History en 4957.725 TriStar Pictures-Entertainment One-JuVee Produ... 2022-09-15 5.000000e+07 9.100000e+07 135.0 Released 7.906 586.0 4.100000e+07
2 2 829799 Paradise City Crime-Action-Thriller en 3133.802 Arcana Studio-308 Enterprises-Yale Productions... 2022-11-11 2.000000e+07 1.647994e+07 93.0 Released 6.153 36.0 -3.520059e+06
3 3 792775 Cop Secret Comedy-Action-Thriller is 2447.908 Pegasus Pictures-Stöð 2-SamFilm 2022-05-23 6.066852e+06 1.647994e+07 100.0 Released 6.375 28.0 1.041309e+07
4 5 505642 Black Panther: Wakanda Forever Action-Adventure-Science Fiction en 2248.449 Marvel Studios 2022-11-09 2.500000e+08 7.330000e+08 162.0 Released 7.547 1182.0 4.830000e+08
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
43829 584853 174323 G.B.F. Comedy-Drama en 0.600 School Pictures-Parting Shots Media-Logolite E... 2013-04-19 3.200000e+06 1.647994e+07 93.0 Released 6.000 366.0 1.327994e+07
43830 587770 182219 Serial Teachers Comedy fr 0.600 UGC 2013-04-17 1.200000e+07 1.647994e+07 88.0 Released 5.470 1109.0 4.479941e+06
43831 609313 184374 Cinco de Mayo: The Battle War-History-Drama es 0.600 Estudios Churubusco Azteca-Gala Films-Gobierno... 2013-05-03 1.000000e+07 1.647994e+07 125.0 Released 7.700 67.0 6.479941e+06
43832 687240 510819 Dirty Dead Con Men Action-Crime-Drama en 0.600 Rock n' Tape Films-Thunder Alley Productions-N... 2018-03-30 6.066852e+06 1.647994e+07 85.0 Released 4.000 20.0 1.041309e+07
43833 711541 505039 Illicit Desires Thriller en 0.600 Retromedia Entertainment 2018-04-03 6.066852e+06 1.647994e+07 81.0 Released 4.500 20.0 1.041309e+07

40554 rows × 15 columns

In [23]:
#Creating a new column to check the profit percentage made by the company 

mv_ds1['Profit_Percentage']=(mv_ds1['Profit']/mv_ds1['budget'])*100
mv_ds1
Out[23]:
index id title genres original_language popularity production_companies release_date budget revenue runtime status imdb_ratings vote_count Profit Profit_Percentage
0 0 436270 Black Adam Action-Fantasy-Science Fiction en 11752.795 New Line Cinema-Flynn Picture Company-Seven Bu... 2022-10-19 2.000000e+08 3.680000e+08 125.0 Released 7.292 2420.0 1.680000e+08 84.000000
1 1 724495 The Woman King Action-Drama-History en 4957.725 TriStar Pictures-Entertainment One-JuVee Produ... 2022-09-15 5.000000e+07 9.100000e+07 135.0 Released 7.906 586.0 4.100000e+07 82.000000
2 2 829799 Paradise City Crime-Action-Thriller en 3133.802 Arcana Studio-308 Enterprises-Yale Productions... 2022-11-11 2.000000e+07 1.647994e+07 93.0 Released 6.153 36.0 -3.520059e+06 -17.600296
3 3 792775 Cop Secret Comedy-Action-Thriller is 2447.908 Pegasus Pictures-Stöð 2-SamFilm 2022-05-23 6.066852e+06 1.647994e+07 100.0 Released 6.375 28.0 1.041309e+07 171.639059
4 5 505642 Black Panther: Wakanda Forever Action-Adventure-Science Fiction en 2248.449 Marvel Studios 2022-11-09 2.500000e+08 7.330000e+08 162.0 Released 7.547 1182.0 4.830000e+08 193.200000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
43829 584853 174323 G.B.F. Comedy-Drama en 0.600 School Pictures-Parting Shots Media-Logolite E... 2013-04-19 3.200000e+06 1.647994e+07 93.0 Released 6.000 366.0 1.327994e+07 414.998153
43830 587770 182219 Serial Teachers Comedy fr 0.600 UGC 2013-04-17 1.200000e+07 1.647994e+07 88.0 Released 5.470 1109.0 4.479941e+06 37.332841
43831 609313 184374 Cinco de Mayo: The Battle War-History-Drama es 0.600 Estudios Churubusco Azteca-Gala Films-Gobierno... 2013-05-03 1.000000e+07 1.647994e+07 125.0 Released 7.700 67.0 6.479941e+06 64.799409
43832 687240 510819 Dirty Dead Con Men Action-Crime-Drama en 0.600 Rock n' Tape Films-Thunder Alley Productions-N... 2018-03-30 6.066852e+06 1.647994e+07 85.0 Released 4.000 20.0 1.041309e+07 171.639059
43833 711541 505039 Illicit Desires Thriller en 0.600 Retromedia Entertainment 2018-04-03 6.066852e+06 1.647994e+07 81.0 Released 4.500 20.0 1.041309e+07 171.639059

40554 rows × 16 columns

In [24]:
#Most of the values for the status is Released we can drop the status column

mv_ds1.drop('status',axis=1,inplace=True)
In [25]:
mv_ds1.shape
mv_ds1.head(5)
Out[25]:
index id title genres original_language popularity production_companies release_date budget revenue runtime imdb_ratings vote_count Profit Profit_Percentage
0 0 436270 Black Adam Action-Fantasy-Science Fiction en 11752.795 New Line Cinema-Flynn Picture Company-Seven Bu... 2022-10-19 2.000000e+08 3.680000e+08 125.0 7.292 2420.0 1.680000e+08 84.000000
1 1 724495 The Woman King Action-Drama-History en 4957.725 TriStar Pictures-Entertainment One-JuVee Produ... 2022-09-15 5.000000e+07 9.100000e+07 135.0 7.906 586.0 4.100000e+07 82.000000
2 2 829799 Paradise City Crime-Action-Thriller en 3133.802 Arcana Studio-308 Enterprises-Yale Productions... 2022-11-11 2.000000e+07 1.647994e+07 93.0 6.153 36.0 -3.520059e+06 -17.600296
3 3 792775 Cop Secret Comedy-Action-Thriller is 2447.908 Pegasus Pictures-Stöð 2-SamFilm 2022-05-23 6.066852e+06 1.647994e+07 100.0 6.375 28.0 1.041309e+07 171.639059
4 5 505642 Black Panther: Wakanda Forever Action-Adventure-Science Fiction en 2248.449 Marvel Studios 2022-11-09 2.500000e+08 7.330000e+08 162.0 7.547 1182.0 4.830000e+08 193.200000
In [26]:
#DATA VISUALIZATION

#Displaying Ratings counts

%matplotlib inline
import matplotlib.pyplot as plt

score=mv_ds1["imdb_ratings"]

font = {'fontname':'Arial', 'size':'14'}
title_font = { 'weight' : 'bold','size':'16'}
plt.hist(score, bins=20)
plt.title("Distribution of the IMDB ratings")
plt.show()
In [27]:
# Correlation with heat map

plt.figure(figsize=(8,6))
sns.heatmap(mv_ds1.corr(),annot=True,fmt='.2f', cmap='viridis')
Out[27]:
<AxesSubplot:>
In [28]:
#Checking for the movies released year wise 

mv_ds1['year'] = pd.DatetimeIndex(mv_ds1['release_date']).year

(ggplot(mv_ds1)         # defining what data to use
 + aes(x='year')    # defining what variable to use
 + geom_bar(size=20) # defining the type of plot to use
)
Out[28]:
<ggplot: (150631231202)>
In [29]:
#Dropping release_date column

mv_ds1.drop('release_date',axis=1,inplace=True)
In [30]:
#Relationship between the imdb ratings and the profit made by the movie 

ggplot(mv_ds1) +\
    aes(x='imdb_ratings', y='Profit') +\
    geom_line() +\
    stat_smooth(colour='green', span=1)
Out[30]:
<ggplot: (150631332060)>
In [31]:
#We can see that there is strong corelation between the imdb_score and the profit . 
#The movies with high imdb rating have made more profit

#Lets check Relationship between imdb_ratings and profit percentage

ggplot(mv_ds1) +\
    aes(x='imdb_ratings', y='Profit_Percentage') +\
    geom_line() +\
    stat_smooth(colour='green', span=1)
Out[31]:
<ggplot: (150631360558)>
In [32]:
#Top 10 movies based on the profit they made

plt.figure(figsize=(7,6))
mv_ds1 = mv_ds1.sort_values(by ='Profit' , ascending=False)
mv_ds1_new = mv_ds1.head(10)
a=sns.pointplot(mv_ds1_new['Profit'], mv_ds1_new['budget'], hue=mv_ds1_new['title'])
a.set_xticklabels(a.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()
In [33]:
#Top 10 movies based on the imdb_ratings

plt.figure(figsize=(10,8))
mv_ds1 = mv_ds1.sort_values(by ='imdb_ratings' , ascending=False)
mv_ds1_new = mv_ds1.head(10)
a=sns.pointplot(mv_ds1_new['id'], mv_ds1_new['imdb_ratings'], hue=mv_ds1_new['title'])
a.set_xticklabels(a.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()
In [34]:
mv_ds1.head(10)
Out[34]:
index id title genres original_language popularity production_companies budget revenue runtime imdb_ratings vote_count Profit Profit_Percentage year
40444 77710 571278 Scooby-Doo's Original Mysteries Animation-Adventure en 2.538 Hanna-Barbera Productions 6.066852e+06 1.647994e+07 110.0 10.0 23.0 1.041309e+07 171.639059 2000
23432 28815 392622 What's New, Scooby-Doo? Vol. 7: Ready to Scare Animation-Comedy-Family-Mystery en 6.206 Hanna-Barbera Productions-Cartoon Network Studios 6.066852e+06 1.647994e+07 97.0 10.0 42.0 1.041309e+07 171.639059 2006
39167 69069 638440 Scooby-Doo: Agence toutou risques, vol. 2 : Le... Animation-Family-Comedy fr 2.819 Hanna-Barbera Productions 6.066852e+06 1.647994e+07 97.0 10.0 21.0 1.041309e+07 171.639059 2007
35498 53109 495686 What's New Scooby-Doo? Vol. 4: Merry Scary Hol... Animation-Comedy-Science Fiction-Action-Family en 3.558 Warner Bros. Pictures-Hanna-Barbera Production... 6.066852e+06 1.647994e+07 85.0 10.0 41.0 1.041309e+07 171.639059 2007
36231 55602 609737 The 1st 13th Annual Fancy Anvil Awards Show Pr... Animation-Family-Comedy en 3.414 Cartoon Network-Hanna-Barbera Productions-Cart... 6.066852e+06 1.647994e+07 120.0 10.0 35.0 1.041309e+07 171.639059 2002
21769 26422 414119 What's New Scooby-Doo? Vol. 3: Halloween Boos ... Animation-Comedy-Family-Mystery en 6.702 Hanna-Barbera Productions-Cartoon Network Studios 6.066852e+06 1.647994e+07 84.0 10.0 44.0 1.041309e+07 171.639059 2007
42131 96325 638443 Scooby-Doo: Agence toutou risques, vol. 1 : Le... Animation-Family-Comedy fr 2.064 Hanna-Barbera Productions 6.066852e+06 1.647994e+07 97.0 10.0 21.0 1.041309e+07 171.639059 2007
34409 49786 405794 Cartoon Network Christmas: Yuletide Follies Animation-Family-Comedy en 3.765 Turner Home Entertainment-Cartoon Network-Hann... 6.066852e+06 1.647994e+07 110.0 9.9 40.0 1.041309e+07 171.639059 2004
40842 80914 642488 Mickey's Safety Club: Street Safe, Street Smart Family-Animation-Comedy-Music-TV Movie en 2.440 Disney Educational Productions-Walt Disney Stu... 6.066852e+06 1.647994e+07 13.0 9.9 30.0 1.041309e+07 171.639059 1989
25357 31600 386024 What's New, Scooby-Doo? Vol. 7: Ghosts on the Go! Animation-Comedy-Family-Mystery en 5.710 Warner Bros. Pictures 6.066852e+06 1.647994e+07 87.0 9.9 42.0 1.041309e+07 171.639059 2006
In [35]:
#Removing the Columns with names

mv_ds1.drop('title', axis=1, inplace=True)
mv_ds1.drop('production_companies', axis=1, inplace=True)
In [36]:
#Remove the linear dependant variables

mv_ds1.drop('Profit', axis=1, inplace=True)
mv_ds1.drop('Profit_Percentage', axis=1, inplace=True)
In [37]:
#Remove the column that is not needed
mv_ds1.drop('popularity', axis=1, inplace=True)
mv_ds1.drop('index', axis=1, inplace=True)
mv_ds1.drop('id', axis=1, inplace=True)
In [38]:
value_counts=mv_ds1["original_language"].value_counts()
print(value_counts)
en    25102
fr     3438
it     1970
ja     1629
es     1465
      ...  
iu        1
se        1
ne        1
qu        1
eo        1
Name: original_language, Length: 84, dtype: int64
In [39]:
vals = value_counts[:1].index
print (vals)
mv_ds1['original_language'] = mv_ds1.original_language.where(mv_ds1.original_language.isin(vals), 'other')
Index(['en'], dtype='object')
In [40]:
mv_ds1["original_language"].value_counts()
Out[40]:
en       25102
other    15452
Name: original_language, dtype: int64
In [41]:
#Dropping vote count table

mv_ds1.drop('vote_count', axis=1, inplace=True)
In [42]:
mv_ds1.drop('genres', axis=1, inplace=True)
In [43]:
#Assigning dummies values to string data

mv_ds1 = pd.get_dummies(data = mv_ds1, columns = ['original_language'] , prefix = ['language'])
In [44]:
# We need to categorize the imdb values in the range of 0-4,4-6,6-8 and 8-10 to mark them as the bad,average,good and excellent movies respectively

mv_ds1["imdb_score"]=pd.cut(mv_ds1['imdb_ratings'], bins=[0,4,6,8,10], right=True, labels=False)+1
In [45]:
#Dropping the imdb_ratings column as it is being replaced with the imdb_score values 

mv_ds1.drop('imdb_ratings',axis=1,inplace=True)
In [46]:
mv_ds1.head(5)
Out[46]:
budget revenue runtime year language_en language_other imdb_score
40444 6.066852e+06 1.647994e+07 110.0 2000 1 0 4
23432 6.066852e+06 1.647994e+07 97.0 2006 1 0 4
39167 6.066852e+06 1.647994e+07 97.0 2007 0 1 4
35498 6.066852e+06 1.647994e+07 85.0 2007 1 0 4
36231 6.066852e+06 1.647994e+07 120.0 2002 1 0 4
In [47]:
mv_ds1.isnull().sum()
Out[47]:
budget            0
revenue           0
runtime           0
year              0
language_en       0
language_other    0
imdb_score        0
dtype: int64
In [48]:
mv_ds1.columns
Out[48]:
Index(['budget', 'revenue', 'runtime', 'year', 'language_en', 'language_other',
       'imdb_score'],
      dtype='object')
In [49]:
#Splitting the data into training and test data

X=pd.DataFrame(columns=['runtime', 'budget', 'revenue', 'year', 'language_en'], data=mv_ds1)
y=pd.DataFrame(columns=['imdb_score'],data=mv_ds1)
In [50]:
#Create train and test data set
, 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.3,random_state=100)
In [51]:
#Feature scaling

from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
In [52]:
#Models

#KNN Model

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn_model = KNeighborsClassifier()

#Fit the model with train data
knn_model.fit(X_train,y_train)

#Predict the target on train data set
knnpred = knn_model.predict(X_train)
print(knnpred)
train_knn = accuracy_score(y_train, knnpred)
print("\nAccuracy score on train dataset:", train_knn)

from sklearn import metrics

cnf_matrix_train = metrics.confusion_matrix(y_train, knnpred)
print("\nTrain Data Confusin Matrix:\n", cnf_matrix_train)
[2 3 2 ... 3 2 2]

Accuracy score on train dataset: 0.7274456617465741

Train Data Confusin Matrix:
 [[   84   279   168     0]
 [   69  5479  3931     1]
 [   51  2750 15061    12]
 [    1    97   378    26]]
In [53]:
#Predict the target on train data set
knnpred1 = knn_model.predict(X_test)
print(knnpred1)
test_knn = accuracy_score(y_test, knnpred1)
print("\nAccuracy score on test dataset:", test_knn)

from sklearn import metrics

cnf_matrix_test = metrics.confusion_matrix(y_test, knnpred1)
print("\nTest Data Confusin Matrix:\n", cnf_matrix_test)
[3 3 2 ... 3 3 3]

Accuracy score on test dataset: 0.6411605161502425

Test Data Confusin Matrix:
 [[  13  135   80    0]
 [  35 1831 2217    1]
 [  30 1642 5953   12]
 [   0   29  185    4]]
In [54]:
#Random Forest

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

#Fit the model with train data
rfc.fit(X_train,y_train)

#Predict the target on train data set

rfc_pred = rfc.predict(X_train)
print(rfc_pred)

train_rfc = accuracy_score(y_train, rfc_pred)
print("\nAccuracy score on train dataset:", train_rfc)

from sklearn import metrics

cnf_matrix_rfc = metrics.confusion_matrix(y_train, rfc_pred)
print("\nTrain Data Confusin Matrix:\n", cnf_matrix_rfc)
[2 3 2 ... 3 2 2]

Accuracy score on train dataset: 0.8618733927502026

Train Data Confusin Matrix:
 [[  246   167   118     0]
 [   12  7540  1924     4]
 [   10  1503 16355     6]
 [    0    41   136   325]]
In [55]:
#Predict the target on test data set

rfc_pred1 = rfc.predict(X_test)
print(rfc_pred1)

test_rfc = accuracy_score(y_test, rfc_pred1)
print("\nAccuracy score on test dataset:", test_rfc)

from sklearn import metrics

cnf_matrix_rfc1 = metrics.confusion_matrix(y_test, rfc_pred1)
print("\nTest Data Confusin Matrix:\n", cnf_matrix_rfc1)
[3 3 3 ... 3 3 3]

Accuracy score on test dataset: 0.6375441768718665

Test Data Confusin Matrix:
 [[   6  133   88    1]
 [  29 1783 2265    7]
 [  29 1607 5965   36]
 [   1   25  189    3]]
In [56]:
#Decision Tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dtree = DecisionTreeClassifier()

#Fit the model with train data
dtree.fit(X_train,y_train)

#Predict the target on train data set
dtree_pred = dtree.predict(X_train)
print(dtree_pred)

train_dtree = accuracy_score(y_train, dtree_pred)
print("\nAccuracy score on train dataset:", train_dtree)

from sklearn import metrics

cnf_matrix_dtree = metrics.confusion_matrix(y_train, dtree_pred)
print("\nTrain Data Confusin Matrix:\n", cnf_matrix_dtree)
[2 2 2 ... 3 2 2]

Accuracy score on train dataset: 0.8619086201430232

Train Data Confusin Matrix:
 [[  302   143    86     0]
 [   50  8063  1367     0]
 [   43  2046 15784     1]
 [    1    57   126   318]]
In [57]:
#Predict the target on test data set

dtree_pred1 = dtree.predict(X_test)
print(dtree_pred1)

test_dtree = accuracy_score(y_test, dtree_pred1)
print("\nAccuracy score on test dataset:", test_dtree)

from sklearn import metrics

cnf_matrix_dtree1 = metrics.confusion_matrix(y_test, dtree_pred1)
print("\nTest Data Confusin Matrix:\n", cnf_matrix_dtree1)
[3 3 2 ... 3 3 3]

Accuracy score on test dataset: 0.5935727788279773

Test Data Confusin Matrix:
 [[  13  135   78    2]
 [  86 2007 1977   14]
 [  73 2235 5188  141]
 [   2   46  156   14]]
In [58]:
#Linear Regression

from sklearn.linear_model import LinearRegression


linreg = LinearRegression()

#Fit the model with train data
linreg.fit(X_train,y_train)

#Predict the target on train data set
linreg_pred = linreg.predict(X_train)
print(linreg_pred)

train_linreg = accuracy_score(y_train, linreg_pred.round())

print("\nAccuracy score on train dataset:", train_linreg)

from sklearn import metrics

cnf_matrix_linreg = metrics.confusion_matrix(y_train, linreg_pred.round())
print("\nTrain Data Confusin Matrix:\n", cnf_matrix_linreg)
[[2.5304214 ]
 [2.82431788]
 [2.4951687 ]
 ...
 [2.70776624]
 [2.721521  ]
 [2.6848867 ]]

Accuracy score on train dataset: 0.6079543452989045

Train Data Confusin Matrix:
 [[    0     5   526     0     0]
 [    0   132  9348     0     0]
 [    0   738 17120    16     0]
 [    0    43   452     6     1]
 [    0     0     0     0     0]]
In [59]:
#Predict the target on test data set
linreg_pred1 = linreg.predict(X_test)
print(linreg_pred1)

test_linreg = accuracy_score(y_test, linreg_pred1.round())

print("\nAccuracy score on test dataset:", test_linreg)

from sklearn import metrics

cnf_matrix_linreg1 = metrics.confusion_matrix(y_test, linreg_pred1.round())
print("\nTest Data Confusin Matrix:\n", cnf_matrix_linreg1)
[[2.61446715]
 [2.70317356]
 [2.62081408]
 ...
 [2.77506609]
 [2.55518263]
 [2.69697516]]

Accuracy score on test dataset: 0.6073806197090491

Test Data Confusin Matrix:
 [[   0    1  227    0    0]
 [   0   60 4024    0    0]
 [   0  300 7330    6    1]
 [   0   19  199    0    0]
 [   0    0    0    0    0]]
In [60]:
#Model Comparison

#classification report for train data on all models

from sklearn.metrics import classification_report

print('classification report for train data on all models\n')

print('KNN Reports\n',classification_report(y_train, knnpred))
print('Random Forests Reports\n',classification_report(y_train, rfc_pred))
print('Decision Tree Reports\n',classification_report(y_train, dtree_pred))
print('Linear Regression Reports\n',classification_report(y_train, linreg_pred.round()))
classification report for train data on all models

KNN Reports
               precision    recall  f1-score   support

           1       0.41      0.16      0.23       531
           2       0.64      0.58      0.61      9480
           3       0.77      0.84      0.81     17874
           4       0.67      0.05      0.10       502

    accuracy                           0.73     28387
   macro avg       0.62      0.41      0.43     28387
weighted avg       0.72      0.73      0.72     28387

Random Forests Reports
               precision    recall  f1-score   support

           1       0.92      0.46      0.62       531
           2       0.82      0.80      0.81      9480
           3       0.88      0.92      0.90     17874
           4       0.97      0.65      0.78       502

    accuracy                           0.86     28387
   macro avg       0.90      0.71      0.77     28387
weighted avg       0.86      0.86      0.86     28387

Decision Tree Reports
               precision    recall  f1-score   support

           1       0.76      0.57      0.65       531
           2       0.78      0.85      0.81      9480
           3       0.91      0.88      0.90     17874
           4       1.00      0.63      0.77       502

    accuracy                           0.86     28387
   macro avg       0.86      0.73      0.78     28387
weighted avg       0.87      0.86      0.86     28387

Linear Regression Reports
               precision    recall  f1-score   support

         1.0       0.00      0.00      0.00       531
         2.0       0.14      0.01      0.03      9480
         3.0       0.62      0.96      0.76     17874
         4.0       0.27      0.01      0.02       502
         5.0       0.00      0.00      0.00         0

    accuracy                           0.61     28387
   macro avg       0.21      0.20      0.16     28387
weighted avg       0.45      0.61      0.48     28387

In [61]:
#classification report for test data on all models

from sklearn.metrics import classification_report

print('classification report for test data on all models\n')

print('KNN Reports\n',classification_report(y_test, knnpred1))
print('Random Forests Reports\n',classification_report(y_test, rfc_pred1))
print('Decision Tree Reports\n',classification_report(y_test, dtree_pred1))
print('Linear Regression Reports\n',classification_report(y_test, linreg_pred1.round()))
classification report for test data on all models

KNN Reports
               precision    recall  f1-score   support

           1       0.17      0.06      0.08       228
           2       0.50      0.45      0.47      4084
           3       0.71      0.78      0.74      7637
           4       0.24      0.02      0.03       218

    accuracy                           0.64     12167
   macro avg       0.40      0.33      0.33     12167
weighted avg       0.62      0.64      0.63     12167

Random Forests Reports
               precision    recall  f1-score   support

           1       0.09      0.03      0.04       228
           2       0.50      0.44      0.47      4084
           3       0.70      0.78      0.74      7637
           4       0.06      0.01      0.02       218

    accuracy                           0.64     12167
   macro avg       0.34      0.31      0.32     12167
weighted avg       0.61      0.64      0.62     12167

Decision Tree Reports
               precision    recall  f1-score   support

           1       0.07      0.06      0.06       228
           2       0.45      0.49      0.47      4084
           3       0.70      0.68      0.69      7637
           4       0.08      0.06      0.07       218

    accuracy                           0.59     12167
   macro avg       0.33      0.32      0.32     12167
weighted avg       0.60      0.59      0.59     12167

Linear Regression Reports
               precision    recall  f1-score   support

         1.0       0.00      0.00      0.00       228
         2.0       0.16      0.01      0.03      4084
         3.0       0.62      0.96      0.76      7637
         4.0       0.00      0.00      0.00       218
         5.0       0.00      0.00      0.00         0

    accuracy                           0.61     12167
   macro avg       0.16      0.19      0.16     12167
weighted avg       0.44      0.61      0.48     12167

In [ ]: